library(data.table)
library(tidyverse)
library(matrixStats)
library(lubridate)
library(ggraph)
library(ggrepel)
library(ape)
library(ggtree)
library(patchwork)
library(furrr)
library(caper)
library(lme4)

Load data

Load allele count data for each replicate. We only consider those that have generated high quality consensus genomes from both replicates.

run1_files <- Sys.glob("./data/allele_counts/run_1/*.tsv")
run2_files <- Sys.glob("./data/allele_counts/run_2/*.tsv")
replicate_meta <- fread("./Processed_data/keep_replicates_meta.csv", data.table = FALSE, 
    sep = ",", fill = TRUE) %>% as_tibble()

run1_files <- run1_files[gsub("\\_ac.tsv", "", gsub(".*/", "", run1_files)) %in% 
    replicate_meta$central_sample_id]
run2_files <- run2_files[gsub("\\_ac.tsv", "", gsub(".*/", "", run2_files)) %in% 
    replicate_meta$central_sample_id]

We only consider intra host single nucleotide variants (iSNVs) that are present with a MAF of at least 0.001 and at least 10 supporting reads in both replicates.

run1_list <- map(run1_files, function(x) {
    df <- fread(x, data.table = FALSE) %>% as_tibble
    df$sample <- rep(gsub("\\.tsv", "", gsub(".*/", "", x)), nrow(df))
    return(df)
})
names(run1_list) <- gsub("_ac\\.tsv", "", gsub(".*/", "", run1_files))
run2_list <- map(run2_files, function(x) {
    df <- fread(x, data.table = FALSE) %>% as_tibble
    df$sample <- rep(gsub("\\.tsv", "", gsub(".*/", "", x)), nrow(df))
    return(df)
})
names(run2_list) <- gsub("_ac\\.tsv", "", gsub(".*/", "", run2_files))

stopifnot(all(names(run1_list) %in% names(run2_list)))
all_sample_names <- names(run1_list)

run_consensus_df <- imap_dfr(all_sample_names, function(s, i) {
    mafA <- run1_list[[s]]
    mafB <- run2_list[[s]]
    
    # check there are replicates
    if (is.null(mafA) || nrow(mafA) < 29000) {
        return(mafB)
    }
    if (is.null(mafB) || nrow(mafB) < 29000) {
        return(mafA)
    }
    stopifnot(all(mafA$POS == mafB$POS))
    
    countA <- sum(mafA$Good_depth)
    countB <- sum(mafB$Good_depth)
    
    inA <- mafA[, c("Count_A", "Count_C", "Count_G", "Count_T")] >= pmax(10, (0.001 * 
        mafA$Good_depth))
    inB <- mafB[, c("Count_A", "Count_C", "Count_G", "Count_T")] >= pmax(10, (0.001 * 
        mafB$Good_depth))
    
    if (countA > countB) {
        mafA[, c("Count_A", "Count_C", "Count_G", "Count_T")][!(inA & inB)] <- 0
        return(mafA)
    } else {
        mafB[, c("Count_A", "Count_C", "Count_G", "Count_T")][!(inA & inB)] <- 0
        return(mafB)
    }
})
run_consensus_df$sample <- gsub("_ac", "", run_consensus_df$sample)
run_consensus_df$Good_depth <- rowSums(run_consensus_df[, c("Count_A", "Count_C", 
    "Count_G", "Count_T")])

# fwrite(run_consensus_df, file =
# './Processed_data/consensus_allele_counts_mixture.csv')
run_consensus_df <- fread("./Processed_data/consensus_allele_counts_mixture.csv", 
    data.table = FALSE)

Calculate sample counts

stopifnot(all(replicate_meta$central_sample_id %in% names(run1_list)))
temp <- replicate_meta %>% filter(central_sample_id %in% names(run1_list)) %>% group_by(biosample_source_id) %>% 
    summarise(count = n()) %>% filter(biosample_source_id != "None")

sample_counts <- tibble(nsamples = length(run1_files), nhosts = length(run1_files) - 
    sum(temp$count - 1), nhosts_with_multi = sum(temp$count > 1))
knitr::kable(sample_counts)
nsamples nhosts nhosts_with_multi
1032 981 43

We now filter iSNVs that were not found using shearwater. We also exclude samples CAMB-728D4 and CAMB-79345 as they had an unusually high number of minority variants (these have already been filtered from the shearwater results).

shearwater_calls <- fread("./data/Shearwater_calls_20200714_annot.txt") %>% as_tibble() %>% 
    filter(!mut %in% c("-", "INS")) %>% filter(str_length(ref) == str_length(mut))

shearwater_calls <- map_dfr(1:nrow(shearwater_calls), function(i) {
    temp <- map2_dfr(unlist(str_split(shearwater_calls$ref[[i]], "")), unlist(str_split(shearwater_calls$mut[[i]], 
        "")), ~{
        temp <- shearwater_calls[i, ]
        temp$ref <- .x
        temp$mut <- .y
        return(temp)
    })
    temp$pos <- temp$pos[[1]] + 0:(nrow(temp) - 1)
    return(temp)
})

shearwater_complex_calls <- fread("./data/Shearwater_calls_20200714_annot.txt", data.table = FALSE) %>% 
    as_tibble()
length(unique(shearwater_complex_calls$pos[shearwater_complex_calls$vaf >= 0.95]))
## [1] 948
length(unique(shearwater_complex_calls$pos[shearwater_complex_calls$vaf < 0.95]))
## [1] 5625
run_consensus_df <- run_consensus_df %>% filter(sample %in% unique(shearwater_calls$sampleID))
colnames(run_consensus_df) <- c("CHR", "POS", "A", "C", "G", "T", "Good_depth", "sample")

# filter shearwater calls to include just those with good replicates
shearwater_calls <- shearwater_calls %>% filter(sampleID %in% unique(run_consensus_df$sample))

sample_allele_counts <- split(run_consensus_df, run_consensus_df$sample)

run_consensus_df <- map_dfr(unique(shearwater_calls$sampleID), function(sample) {
    # consensus of lofreq calls
    sw_calls <- shearwater_calls %>% filter(sampleID == sample)
    
    sample_count <- sample_allele_counts[[sample]]
    
    consA <- (sample_count[, c("A", "C", "G", "T")] > 0) & (sample_count[, c("A", 
        "C", "G", "T")] == rowMaxs(as.matrix(sample_count[, c("A", "C", "G", "T")])))
    rownames(consA) <- sample_count$POS
    
    sw_calls <- sw_calls %>% filter(pos %in% sample_count$POS)
    
    consA[cbind(sw_calls$pos, sw_calls$mut)] <- TRUE
    consA[cbind(sw_calls$pos, sw_calls$ref)] <- TRUE
    
    sample_count[, c("A", "C", "G", "T")][!consA] <- 0
    
    return(sample_count)
})

Distribution of iSNVs across the genome

Only consider those calls with a coverage of at least 1000. Split into variable and consensus sites.

MINMAF <- 0.01
MINDEPTH <- 1000

# Store all counts
all_run_consensus_df <- run_consensus_df

# Filter to those variable sites
run_consensus_df <- run_consensus_df %>% filter(Good_depth >= MINDEPTH)

keep <- rowSums((run_consensus_df[, c("A", "C", "G", "T")]/run_consensus_df$Good_depth) >= 
    MINMAF) > 1
run_consensus_df <- as_tibble(run_consensus_df[keep, ])

We ensure only the initial sample for each patient is considered.

meta_multi <- replicate_meta %>% filter(biosample_source_id != "None") %>% group_by(biosample_source_id) %>% 
    summarise(samples = list(central_sample_id[order(collection_date)]), dates = list(collection_date[order(collection_date)]), 
        n_samples = length(unique(central_sample_id)), n_dates = length(unique(collection_date))) %>% 
    filter(n_samples > 1)

repeated_samples <- unlist(map(meta_multi$samples, ~.x[-1]))

Distribution of shared iSNVs across consensus phylogeny

We now plot the phylogeny built from the consensus assemblies of our samples. Links are drawn between the tips of this tree if the samples share an intrahost variant. To avoid this plot being dominated by the most highly variable sites we exclude sites seen in more than 2.5% of samples.

all_seqs <- ape::read.dna("./data/consensus/MA_elan.20200529.consensus.filt.fasta", 
    format = "fasta")
full_tree <- read.tree("./data/consensus/elan.20200529.consensus.filt.mask.tree")
full_tree <- root(full_tree, outgroup = "MN908947.3", resolve.root = TRUE)
full_tree <- drop.tip(full_tree, full_tree$tip.label[!full_tree$tip.label %in% all_sample_names])

# determine which minor alleles are seen at the consensus level
consensus_cov <- map_dfr(unique(run_consensus_df$POS), function(i) {
    bf <- as_tibble(t(ape::base.freq(all_seqs[, i]))) %>% add_column(POS = i, .before = 1)
    return(bf)
})

# exclude positions seen in more than 2.5% of samples
common_sites <- run_consensus_df %>% group_by(POS) %>% summarise(count = n()) %>% 
    filter(count > 24)

# pre split dataframe for speed
representative_sample_names <- all_sample_names[!all_sample_names %in% repeated_samples]
samples_list <- map(representative_sample_names, ~run_consensus_df %>% filter(sample == 
    .x) %>% filter(!POS %in% common_sites$POS))
names(samples_list) <- representative_sample_names

pw <- combinat::combn(length(samples_list), 2)

MINMAF <- 0.01

pairwise_shared_maf <- map2_dfr(pw[1, ], pw[2, ], function(indexA, indexB) {
    mafA <- samples_list[[indexA]]
    mafB <- samples_list[[indexB]]
    
    inboth <- mafA$POS[mafA$POS %in% mafB$POS]
    
    if (length(inboth) < 1) {
        return(tibble())
    }
    mafA <- mafA[match(inboth, mafA$POS), ]
    mafB <- mafB[match(inboth, mafB$POS), ]
    
    sim <- map_dfr(1:nrow(mafA), ~{
        a <- mafA[.x, c("A", "C", "G", "T")]
        b <- mafB[.x, c("A", "C", "G", "T")]
        a <- (a >= (MINMAF * mafA$Good_depth[[.x]])) & (a < max(a))
        b <- (b >= (MINMAF * mafB$Good_depth[[.x]])) & (b < max(b))
        cons <- consensus_cov[consensus_cov$POS == mafA$POS[[.x]], ]
        tibble(cons_count = sum(a & b & (cons[2:5] > 0)), non_cons_count = sum(a & 
            b) - cons_count, cons_alles = list(c("a", "c", "g", "t")[a & b & (cons[2:5] > 
            0)]), non_cons_alles = list(c("a", "c", "g", "t")[a & b & (cons[2:5] == 
            0)]))
    })
    
    poses <- mafA$POS[sim[, 1] > 0]
    poses_maf <- unlist(sim[sim[, 1] > 0, 3])
    
    cons_poses <- mafA$POS[sim[, 2] > 0]
    cons_maf <- unlist(sim[sim[, 2] > 0, 4])
    
    sim <- colSums(sim[, 1:2])
    
    return(tibble(sampleA = representative_sample_names[[indexA]], sampleB = representative_sample_names[[indexB]], 
        cons_count = sim["cons_count"], non_cons_count = sim["non_cons_count"], cons_pos = list(poses), 
        cons_pos_maf = list(poses_maf), non_cons_pos = list(cons_poses), non_cons_pos_maf = list(cons_maf)))
})

temp <- shearwater_complex_calls %>% filter(!sampleID %in% repeated_samples) %>% 
    filter(r1_vaf < 1 & r2_vaf < 1)

# filter those seen in more than 2% of samples
high_prev <- names(table(paste(temp$pos, temp$mut)))[table(paste(temp$pos, temp$mut)) > 
    21]
complex_calls <- split(paste(temp$pos, temp$mut), temp$sampleID)

pairwise_shared_maf$IHV <- map2_dbl(pairwise_shared_maf$sampleA, pairwise_shared_maf$sampleB, 
    ~{
        sum((complex_calls[[.x]] %in% complex_calls[[.y]]) & (!complex_calls[[.x]] %in% 
            high_prev))
    })
pairwise_shared_maf$IHV_all <- map2_dbl(pairwise_shared_maf$sampleA, pairwise_shared_maf$sampleB, 
    ~{
        sum((complex_calls[[.x]] %in% complex_calls[[.y]]) & (!complex_calls[[.x]] %in% 
            high_prev))
    })

# plot links against tree
d = fortify(full_tree)
d = subset(d, isTip)
tip_order <- with(d, label[order(y, decreasing = T)])
plotdf <- melt(pairwise_shared_maf[, c(1, 2, 9, 10)], value.name = "count") %>% filter(count > 
    0) %>% as_tibble()
plotdf$variable <- as.character(plotdf$variable)
plotdf$variable[plotdf$variable == "IHV"] <- "within-host variants < 2% of samples"
plotdf$variable[plotdf$variable == "IHV_all"] <- "all within-host variants"

plotdf <- plotdf %>% filter(sampleA %in% full_tree$tip.label) %>% filter(sampleB %in% 
    full_tree$tip.label)

nodes <- tibble(nodes = tip_order, pos = length(tip_order):1)
gr <- igraph::graph_from_data_frame(plotdf, vertices = nodes)

link <- ggraph(gr, layout = "manual", y = rep(0, length(nodes$pos)), x = nodes$pos) + 
    geom_edge_arc(aes(color = count), alpha = 0.1, fold = TRUE) + scale_x_continuous(breaks = 1:length(tip_order), 
    labels = tip_order) + facet_wrap(~variable) + coord_flip() + theme(strip.text.x = element_text(size = 10))

gg <- ggtree(full_tree)

gg + link

ggsave(filename = "Figures/phylogeny_with_maf_links2.pdf", width = 12, height = 7)
ggsave(filename = "Figures/phylogeny_with_maf_links2.png", width = 12, height = 7)

Identifying possible mixed infections

Potential mixed infections were identified using a linear model by testing whether the allele frequencies in a sample could be better explained by the inclusion of an additional consensus genome from the COG-UK dataset of the 29th May 2020. Additional samples mixtures were considered if the addition of a COG-UK consensus genome could explain at least 2 iSNVs and have at most 1 variant that was not found in the alleles of the sample. This identified 54 putative mixtures which were then screened manually to obtain 36 potentially mixed samples.

fwrite(run_consensus_df, "./Processed_data/run_consensus_mixture.csv", sep = ",", 
    quote = FALSE, row.names = FALSE)
run_consensus_df <- fread("./Processed_data/run_consensus_mixture.csv", data.table = FALSE) %>% 
    as_tibble()

all_seqs <- ape::read.dna("./data/consensus/MA_elan.20200529.consensus.filt.fasta", 
    format = "fasta")
aln_length <- ncol(all_seqs)
all_seqs[!all_seqs %in% as.DNAbin(c("a", "c", "g", "t"))] <- as.DNAbin("n")
names <- unlist(fread(file = "./data/consensus/snp_dist_MA_elan.20200529.consensus.filt.mask.csv", 
    header = F, nrows = 1, sep = "\t"))
snp_dist <- fread("./data/consensus/snp_dist_MA_elan.20200529.consensus.filt.mask.csv", 
    data.table = FALSE, skip = 1, header = FALSE) %>% as_tibble()
snp_dist$V1 <- names[snp_dist$V1 + 2]
snp_dist$V2 <- names[snp_dist$V2 + 2]

dups <- snp_dist %>% group_by(V1) %>% summarise(dups = list(V2))
dups <- unlist(dups$dups)
all_seqs_dedup <- rownames(all_seqs)[!rownames(all_seqs) %in% dups]

all_samples <- unique(run_consensus_df$sample)

plan(multiprocess)

mixture_tests <- furrr::future_map_dfr(all_samples, function(sA) {
    print(sA)
    aA_orig <- all_seqs[rownames(all_seqs) == sA, ]
    
    mA_orig <- matrix(0, nrow = 4, ncol = aln_length)
    mA_orig[1, aA_orig == as.DNAbin("a")] <- 1
    mA_orig[2, aA_orig == as.DNAbin("c")] <- 1
    mA_orig[3, aA_orig == as.DNAbin("g")] <- 1
    mA_orig[4, aA_orig == as.DNAbin("t")] <- 1
    
    sfA_orig <- run_consensus_df %>% filter(sample == sA)
    fA_orig <- mA_orig
    fA_orig[, sfA_orig$POS] <- t(sfA_orig[, c("A", "C", "G", "T")]/rowSums(sfA_orig[, 
        c("A", "C", "G", "T")]))
    mA_orig[, sfA_orig$POS] <- 1 * t(t(fA_orig[, sfA_orig$POS, drop = FALSE]) == 
        colMaxs(fA_orig[, sfA_orig$POS, drop = FALSE]))
    
    pairwise_res <- map_dfr(1:length(all_seqs_dedup), ~{
        print(.x)
        sB <- all_seqs_dedup[[.x]]
        
        aB <- all_seqs[rownames(all_seqs) == sB, ]
        
        mB <- matrix(0, nrow = 4, ncol = aln_length)
        mB[1, aB == as.DNAbin("a")] <- 1
        mB[2, aB == as.DNAbin("c")] <- 1
        mB[3, aB == as.DNAbin("g")] <- 1
        mB[4, aB == as.DNAbin("t")] <- 1
        
        
        keep <- (colSums(mA_orig) > 0) & (colSums(mB) > 0)
        mA <- mA_orig[, keep]
        mB <- mB[, keep]
        fA <- fA_orig[, keep]
        
        model_data <- tibble(freq = c(unlist(fA)), consensusA = c(unlist(mA)), consensusB = c(unlist(mB)))
        
        model_data <- model_data[(rowSums(model_data == 1) != 3) & (rowSums(model_data == 
            0) != 3), , drop = FALSE]
        # model_data <-
        # model_data[model_data$consensusA!=model_data$consensusB,,drop=FALSE]
        n_extra_explained <- sum(((model_data$freq > 0) & (model_data$consensusB > 
            0)) & (model_data$consensusA <= 0))
        n_mismatch <- sum((model_data$freq <= 0) & (model_data$consensusB > 0))
        
        if (nrow(model_data) < 1) {
            return(tibble(sampleA = sA, sampleB = sB, estimate = NA, std.error = NA, 
                statistic = NA, p.value = NA, n_extra_explained = n_extra_explained, 
                n_mismatch = n_mismatch))
        }
        
        model <- broom::tidy(lm(freq ~ -1 + consensusA + consensusB, model_data))[2, 
            ] %>% add_column(sampleB = sB, .before = 1) %>% add_column(sampleA = sA, 
            .before = 1)
        model$term <- NULL
        model$n_extra_explained <- n_extra_explained
        model$n_mismatch <- n_mismatch
        return(model)
        
    }) %>% arrange(p.value) %>% filter(n_mismatch < 3)
    
    return(pairwise_res)
    
}, .progress = TRUE)

write.csv(mixture_tests, "./Processed_data/mixture_tests_revised.csv", sep = ",", 
    quote = FALSE, row.names = FALSE)

Load the preprocessed results from the code above.

mixture_tests <- fread("./Processed_data/mixture_tests_revised.csv", data.table = FALSE) %>% 
    as_tibble()
mixture_tests_mm <- mixture_tests %>% filter(n_mismatch < 2) %>% filter(n_extra_explained > 
    1) %>% filter(estimate > 0) %>% arrange(p.value)
mixture_tests_mm <- mixture_tests_mm[!duplicated(mixture_tests_mm$sampleA), ]

# remove those that on manual inspection don't appear to be convincingly mixtures
mixture_tests_mm <- mixture_tests_mm[!mixture_tests_mm$sampleA %in% c("CAMB-7A326", 
    "CAMB-75E93", "CAMB-71F76", "CAMB-71D30", "CAMB-7657F", "CAMB-776F3", "CAMB-76B31", 
    "CAMB-76BC8", "CAMB-7A71B", "CAMB-7780C", "CAMB-72494", "CAMB-79433", "CAMB-77CB5", 
    "CAMB-775E7", "CAMB-75FFA", "CAMB-7A890", "CAMB-76B9B", "CAMB-7674C", "CAMB-77FBC"), 
    ]

generate plots of the resulting mixtures

all_meta <- fread("./data/consensus/cog_2020-05-08_metadata.csv", data.table = FALSE) %>% 
    as_tibble()
all_meta$sample <- map_chr(str_split(all_meta$sequence_name, "/"), ~.x[[2]])
mixture_tests_mm$lineageA <- all_meta$lineage[match(mixture_tests_mm$sampleA, all_meta$sample)]
mixture_tests_mm$lineageB <- all_meta$lineage[match(mixture_tests_mm$sampleB, all_meta$sample)]
aln_length <- ncol(all_seqs)

pair_plots_data <- map_dfr(1:nrow(mixture_tests_mm), function(i) {
    print(i)
    sA <- mixture_tests_mm$sampleA[[i]]
    sB <- mixture_tests_mm$sampleB[[i]]
    
    aA_orig <- all_seqs[rownames(all_seqs) == sA, ]
    
    mA_orig <- matrix(0, nrow = 4, ncol = aln_length)
    mA_orig[1, aA_orig == as.DNAbin("a")] <- 1
    mA_orig[2, aA_orig == as.DNAbin("c")] <- 1
    mA_orig[3, aA_orig == as.DNAbin("g")] <- 1
    mA_orig[4, aA_orig == as.DNAbin("t")] <- 1
    
    sfA_orig <- run_consensus_df %>% filter(sample == sA)
    fA_orig <- mA_orig
    fA_orig[, sfA_orig$POS] <- t(sfA_orig[, c("A", "C", "G", "T")]/rowSums(sfA_orig[, 
        c("A", "C", "G", "T")]))
    mA_orig[, sfA_orig$POS] <- 1 * t(t(fA_orig[, sfA_orig$POS, drop = FALSE]) == 
        colMaxs(fA_orig[, sfA_orig$POS, drop = FALSE]))
    
    aB <- all_seqs[rownames(all_seqs) == sB, ]
    
    mB <- matrix(0, nrow = 4, ncol = aln_length)
    mB[1, aB == as.DNAbin("a")] <- 1
    mB[2, aB == as.DNAbin("c")] <- 1
    mB[3, aB == as.DNAbin("g")] <- 1
    mB[4, aB == as.DNAbin("t")] <- 1
    
    keep <- (colSums(mA_orig) > 0) & (colSums(mB) > 0)
    colnames(mA_orig) <- 1:ncol(mA_orig)
    colnames(mB) <- 1:ncol(mB)
    colnames(fA_orig) <- 1:ncol(fA_orig)
    mA <- mA_orig[, keep]
    mB <- mB[, keep]
    fA <- fA_orig[, keep]
    
    model_data <- tibble(pos = rep(as.numeric(colnames(fA)), each = 4), base = rep(c("A", 
        "C", "G", "T"), ncol(fA)), freq = c(unlist(fA)), consensusA = c(unlist(mA)), 
        consensusB = c(unlist(mB)))
    
    colnames(model_data)[3:5] <- c("frequency", "consensus", "co-infection")
    # sprintf('%s, lineage %s', sA, mixture_tests_mm$lineageA[[i]]), sprintf('%s,
    # lineage %s', sB, mixture_tests_mm$lineageB[[i]]))#,
    # mixture_tests_mm$n_extra_explained[[i]]))
    
    model_data <- model_data[(rowSums(model_data[, 3:5] == 1) != 3) & (rowSums(model_data[, 
        3:5] == 0) != 3), ]
    
    model_data$lineage <- "other"
    model_data$lineage[model_data[, 5] == 1] <- colnames(model_data)[[5]]
    model_data$lineage[model_data[, 4] == 1] <- colnames(model_data)[[4]]
    
    model_data$pair <- sprintf("%s (%s) - %s (%s)", sA, mixture_tests_mm$lineageA[[i]], 
        sB, mixture_tests_mm$lineageB[[i]])
    
    return(model_data)
})
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
pair_plots_data$lineage <- factor(pair_plots_data$lineage, levels = c("consensus", 
    "co-infection", "other"))

gg <- ggplot(pair_plots_data, aes(x = pos, y = frequency, col = lineage)) + geom_point(size = 2, 
    alpha = 0.5) + scale_x_continuous(limits = c(1, 29903)) + scale_y_continuous(breaks = c(0, 
    0.25, 0.5, 0.75, 1)) + scale_color_manual(values = c("#4393c3", "#d6604d", "#bdbdbd")) + 
    facet_wrap(~pair, ncol = 5) + ylab("freq") + theme_bw(base_size = 12) + xlab("position") + 
    ylab("frequency")
gg

# pp <- patchwork::wrap_plots(pair_plots,guides = 'collect') +
# patchwork::plot_layout(ncol = 1)
ggsave(gg, filename = "Figures/potential_mixtures_revised.pdf", width = 20, height = 20, 
    limitsize = FALSE)


gg <- ggplot(pair_plots_data %>% filter(pair %in% c("CAMB-7217F (B.1) - CAMB-79EF9 (B.3)", 
    "CAMB-7A57B (B.2.1) - CAMB-75ADB (B.1.1)", "CAMB-75BE7 (B.3) - CAMB-76621 (B.3)")), 
    aes(x = pos, y = frequency, col = lineage)) + geom_point(size = 2, alpha = 0.5) + 
    scale_x_continuous(limits = c(1, 29903)) + scale_y_continuous(breaks = c(0, 0.25, 
    0.5, 0.75, 1)) + scale_color_manual(values = c("#4393c3", "#d6604d", "#bdbdbd")) + 
    facet_wrap(~pair, ncol = 1) + ylab("freq") + theme_bw(base_size = 16) + xlab("position") + 
    ylab("frequency")
gg

ggsave(gg, filename = "Figures/potential_mixtures_selected_3.png", width = 12, height = 7)
ggsave(gg, filename = "Figures/potential_mixtures_selected_3.pdf", width = 12, height = 7)

Consensus background of common intrahost variants

To help ascertain whether common intrahost variants are the result of independent mutation events rather than transmission we generate heatmaps for a selection of variants which indicate the genomic background in which they occur. If these variants were the result of transmission we would expect to see the genetic background of fixed variants after aligning to the reference. That is, given an intrahost variant we would expect to see highly similar consensus sequences. The heatmaps below indicate that this is generally not the case and thus independent mutation events better explain many of the shared intrahost variants between samples.

shearwater_calls_no_rep <- shearwater_calls %>% filter(!sampleID %in% repeated_samples)
shearwater_calls_no_rep$pos_change <- sprintf("%s%i%s", shearwater_calls_no_rep$ref, 
    shearwater_calls_no_rep$pos, shearwater_calls_no_rep$mut)

plot_mixed_vaf <- function(pos) {
    temp_samples <- unique(shearwater_calls_no_rep$sampleID[(shearwater_calls_no_rep$pos_change == 
        pos) & (shearwater_calls_no_rep$vaf < 0.95) & (shearwater_calls_no_rep$vaf > 
        0)])
    pdf <- shearwater_calls_no_rep %>% filter(sampleID %in% temp_samples)
    pdf$pos_change <- factor(pdf$pos_change, levels = c(pos, unique(pdf$pos_change)[unique(pdf$pos_change) != 
        pos]))
    gg <- ggplot(pdf, aes(x = pos_change, y = sampleID, fill = vaf)) + geom_tile() + 
        theme_bw(base_size = 14) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) + 
        ylab("sample") + xlab("variant") + # scale_fill_stepsn(colours =
    # c('#ca0020','#f4a582','#f7f7f7','#92c5de','#0571b0'), breaks=c(0.25,0.5,0.75))
    # +
    scale_fill_binned(type = "viridis", breaks = c(0.25, 0.5, 0.75)) + ggtitle(pos)
    gg
    return(gg)
}

plot_mixed_vaf("A13780T")

ggsave("./Figures/heatmap_A13780T.png", height = 7, width = 10)

plot_mixed_vaf("C683T")

ggsave("./Figures/heatmap_C683T.png", height = 7, width = 10)

plot_mixed_vaf("A1547T")

ggsave("./Figures/heatmap_A1547T.png", height = 7, width = 10)

We call lineages using pangolin v1.1.14 and database date 2020-05-19.

lineage_calls <- fread("/Users/gt4/Documents/covid_deep/data/consensus/combined_consensus_replicates_filt_lineage_report.csv", 
    data.table = FALSE) %>% as_tibble()
stopifnot(all(run_consensus_df$sample %in% lineage_calls$taxon))
run_consensus_df$lineage <- lineage_calls$lineage[match(run_consensus_df$sample, 
    lineage_calls$taxon)]

Pairwise transmission model

To investigate transmission, samples were only considered if both replicates produced high quality consensus genomes. When multiple samples from the same host were available the earliest sample was used. Pairwise SNP distances were generated between the consensus genomes using pairsnp v0.2.0 (Tonkin-Hill 2018). The distribution of the underlying number of intermediate transmission events between each pair of samples was then inferred using an implementation of the transcluster algorithm (Stimson et al. 2019; Tonkin-Hill 2020). The serial interval and evolutionary rate were set to 5 days and 1e-3 substitutions/site/year (Fauver et al. 2020; Zhang et al. 2020).

replicate_meta <- fread("./Processed_data/keep_replicates_meta.csv", data.table = FALSE, 
    sep = ",", fill = TRUE) %>% as_tibble()
consensus_seqs <- ape::read.dna("./data/consensus/combined_consensus_replicates_filt.fa", 
    format = "fasta")
meta <- fread("./data/replicate_meta.tsv", data.table = FALSE) %>% as_tibble()
date_info <- replicate_meta[, c("central_sample_id", "collection_date")] %>% filter(collection_date != 
    "None")

consensus_seqs_w_dates <- consensus_seqs[names(consensus_seqs) %in% date_info$central_sample_id]
ape::write.FASTA(consensus_seqs_w_dates, file = "./data/consensus/combined_consensus_replicates_filt_dates.fa")
fwrite(date_info, file = "./data/consensus/combined_consensus_replicates_filt_dates.csv", 
    quote = FALSE, sep = ",", col.names = TRUE)

Create the necessary multiple sequence alignment.

mafft --auto --thread -1 --keeplength --addfragments combined_consensus_replicates_filt_dates.fa nCoV-2019.reference.fasta > MA_combined_consensus_replicates_filt_dates.fa
temp_msa <- ape::read.dna("./data/consensus/MA_combined_consensus_replicates_filt_dates.fa", 
    format = "fasta")
temp_msa <- temp_msa[2:nrow(temp_msa), ]
ape::write.FASTA(temp_msa, file = "./data/consensus/MA_combined_consensus_replicates_filt_dates.fa")

Run the pairsnp and transcluster algorithms. These packages are available at https://github.com/gtonkinhill/pairsnp-cpp and https://github.com/gtonkinhill/fasttranscluster respectively.

python ~/fasttranscluster/fasttranscluster-runner.py --msa MA_combined_consensus_replicates_filt_dates.fa  --dates combined_consensus_replicates_filt_dates.csv --save_probs --snp_threshold 10 -K 20 -o ./

Load results and filter out repeated samples from the same patient.

trans_probs <- fread("./data/consensus/transcluster_probabilities.csv", data.table = FALSE) %>% 
    as_tibble()

# remove repeated samples
trans_probs <- trans_probs %>% filter(!sampleA %in% repeated_samples) %>% filter(!sampleB %in% 
    repeated_samples)

Transmission

Plot with possible mixed infections included.

temp <- shearwater_complex_calls %>% filter(!sampleID %in% repeated_samples) %>% 
    filter(r1_vaf < 0.99 & r2_vaf < 0.99)

high_prev <- names(table(paste(temp$pos, temp$mut)))[table(paste(temp$pos, temp$mut)) > 
    5]
temp$mut_pos <- paste(temp$pos, temp$mut)
temp$maf <- temp$vaf
temp$maf[temp$vaf > 0.5] <- temp$vaf_ref[temp$vaf > 0.5]
temp <- temp %>% filter(!mut_pos %in% high_prev)
complex_calls <- split(temp, temp$sampleID)

trans_vs_poly <- map_dfr(c(0.01, 0.02, 0.05, 0.1), function(min_maf) {
    trans_probs$shared_poly <- map_dbl(1:nrow(trans_probs), function(i) {
        if (!trans_probs$sampleA[[i]] %in% names(complex_calls)) 
            return(0)
        if (!trans_probs$sampleB[[i]] %in% names(complex_calls)) 
            return(0)
        mafA <- complex_calls[[trans_probs$sampleA[[i]]]] %>% filter(maf > min_maf)
        mafB <- complex_calls[[trans_probs$sampleB[[i]]]] %>% filter(maf > min_maf)
        return(sum(mafA$mut_pos %in% mafB$mut_pos))
    })
    trans_probs$min_maf <- min_maf
    return(trans_probs)
})

trans_vs_poly$`probability direct transmission` <- cut(exp(trans_vs_poly$`0`), breaks = seq(0, 
    0.4, 0.01))

temp <- trans_vs_poly %>% filter(min_maf == 0.1) %>% filter(exp(`0`) > 0.18) %>% 
    filter(shared_poly > 0)
length(unique(c(temp$sampleA, temp$sampleB)))
## [1] 6
trans_vs_poly <- trans_vs_poly %>% group_by(min_maf, `probability direct transmission`) %>% 
    summarise(mean_shared_poly = mean(shared_poly))
colnames(trans_vs_poly) <- c("MAF", "probability direct transmission", "mean shared minor variants")
trans_vs_poly$MAF <- paste("minimum MAF:", trans_vs_poly$MAF)

ggplot(trans_vs_poly, aes(x = `probability direct transmission`, y = `mean shared minor variants`)) + 
    geom_col() + facet_wrap(~MAF, ncol = 1) + theme_bw(base_size = 16) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + ylab("mean pairwise shared variants")

ggsave(filename = "./Figures/pairwise_transmission_prob_vs_multi_maf_w_mix.pdf", 
    width = 9, height = 7)
ggsave(filename = "./Figures/pairwise_transmission_prob_vs_multi_maf_w_mix.png", 
    width = 9, height = 7)

plot with possible mixtures excluded

trans_probs <- fread("./data/consensus/transcluster_probabilities.csv", data.table = FALSE) %>% 
    as_tibble()

# remove repeated samples
trans_probs <- trans_probs %>% filter(!sampleA %in% repeated_samples) %>% filter(!sampleB %in% 
    repeated_samples)

# remove samples that are potentially mixed
mixed_samples <- mixture_tests_mm$sampleA

trans_probs <- trans_probs %>% filter(!sampleA %in% mixed_samples) %>% filter(!sampleB %in% 
    mixed_samples)

trans_vs_poly <- map_dfr(c(0.01, 0.02, 0.05, 0.1), function(min_maf) {
    trans_probs$shared_poly <- map_dbl(1:nrow(trans_probs), function(i) {
        if (!trans_probs$sampleA[[i]] %in% names(complex_calls)) 
            return(0)
        if (!trans_probs$sampleB[[i]] %in% names(complex_calls)) 
            return(0)
        mafA <- complex_calls[[trans_probs$sampleA[[i]]]] %>% filter(maf > min_maf)
        mafB <- complex_calls[[trans_probs$sampleB[[i]]]] %>% filter(maf > min_maf)
        return(sum(mafA$mut_pos %in% mafB$mut_pos))
    })
    trans_probs$min_maf <- min_maf
    return(trans_probs)
})

trans_vs_poly$`probability direct transmission` <- cut(exp(trans_vs_poly$`0`), breaks = seq(0, 
    0.4, 0.01))

temp <- trans_vs_poly %>% filter(min_maf == 0.1) %>% filter(exp(`0`) > 0.18) %>% 
    filter(shared_poly > 0)
length(unique(c(temp$sampleA, temp$sampleB)))
## [1] 6
trans_vs_poly <- trans_vs_poly %>% group_by(min_maf, `probability direct transmission`) %>% 
    summarise(mean_shared_poly = mean(shared_poly))
colnames(trans_vs_poly) <- c("MAF", "probability direct transmission", "mean shared minor variants")
trans_vs_poly$MAF <- paste("minimum MAF:", trans_vs_poly$MAF)

ggplot(trans_vs_poly, aes(x = `probability direct transmission`, y = `mean shared minor variants`)) + 
    # geom_point(size=5) +
geom_col() + facet_wrap(~MAF, ncol = 1) + theme_bw(base_size = 16) + theme(axis.text.x = element_text(angle = 45, 
    hjust = 1)) + ylab("mean pairwise shared variants")

ggsave(filename = "./Figures/pairwise_transmission_prob_vs_multi_maf.pdf", width = 9, 
    height = 7)
ggsave(filename = "./Figures/pairwise_transmission_prob_vs_multi_maf.png", width = 9, 
    height = 7)

Position level statistics

To investigate the phylogentic signal present in the patterns of intra host varianst across the consensus phylogeny we generated binary presence/absence vectors to indicate which of the replicate pairs a minority variant occurred in. These were then used with the maximum likelihood phylogeny previously described to infer both the delta and D-statistic which indicates the concordance between the minority variant and the consensus phylogeny. The R package caper v0.2 was used to infer the D-statistic (Orme et al. 2013; Fritz and Purvis 2010). The delta statistic was calculated using the R code provided with the publication (Borges et al. 2019).

Delta statistic

source("./delta_statistic.R")

samples_considered <- unique(run_consensus_df$sample)[!unique(run_consensus_df$sample) %in% 
    mixture_tests_mm$sampleA]
samples_considered <- c(samples_considered, "MN908947.3")

ml_tree <- read.tree("./data/consensus/elan.20200529.consensus.filt.mask.tree")
ml_tree <- drop.tip(ml_tree, ml_tree$tip.label[!ml_tree$tip.label %in% samples_considered])
ml_tree <- root(ml_tree, outgroup = "MN908947.3", resolve.root = TRUE)
ml_tree <- multi2di(ml_tree)
ml_tree$edge.length <- pmax(ml_tree$edge.length, 1e-10)

plan(multiprocess)

tb <- table(run_consensus_df$POS[run_consensus_df$sample %in% samples_considered])

delta_results <- furrr::future_map_dfr(names(tb)[tb > 2], ~{
    tr <- ml_tree$tip.label %in% run_consensus_df$sample[run_consensus_df$POS == 
        .x]
    d <- delta(trait = tr, tree = ml_tree, lambda0 = 1, se = 1, sim = 10000, thin = 10, 
        burn = 5000)
    tibble(POS = .x, delta = d)
}, .progress = TRUE)
## 
 Progress: ────                                                                                                                                                  100%
 Progress: ─────                                                                                                                                                 100%
 Progress: ───────                                                                                                                                               100%
 Progress: ────────                                                                                                                                              100%
 Progress: ──────────                                                                                                                                            100%
 Progress: ───────────                                                                                                                                           100%
 Progress: ────────────                                                                                                                                          100%
 Progress: ──────────────                                                                                                                                        100%
 Progress: ────────────────                                                                                                                                      100%
 Progress: ────────────────                                                                                                                                      100%
 Progress: ────────────────                                                                                                                                      100%
 Progress: ──────────────────                                                                                                                                    100%
 Progress: ────────────────────                                                                                                                                  100%
 Progress: ──────────────────────                                                                                                                                100%
 Progress: ──────────────────────                                                                                                                                100%
 Progress: ────────────────────────                                                                                                                              100%
 Progress: ──────────────────────────                                                                                                                            100%
 Progress: ───────────────────────────                                                                                                                           100%
 Progress: ────────────────────────────                                                                                                                          100%
 Progress: ─────────────────────────────                                                                                                                         100%
 Progress: ─────────────────────────────                                                                                                                         100%
 Progress: ───────────────────────────────                                                                                                                       100%
 Progress: ─────────────────────────────────                                                                                                                     100%
 Progress: ──────────────────────────────────                                                                                                                    100%
 Progress: ────────────────────────────────────                                                                                                                  100%
 Progress: ─────────────────────────────────────                                                                                                                 100%
 Progress: ──────────────────────────────────────                                                                                                                100%
 Progress: ───────────────────────────────────────                                                                                                               100%
 Progress: ─────────────────────────────────────────                                                                                                             100%
 Progress: ──────────────────────────────────────────                                                                                                            100%
 Progress: ────────────────────────────────────────────                                                                                                          100%
 Progress: ────────────────────────────────────────────                                                                                                          100%
 Progress: ─────────────────────────────────────────────                                                                                                         100%
 Progress: ──────────────────────────────────────────────                                                                                                        100%
 Progress: ────────────────────────────────────────────────                                                                                                      100%
 Progress: ─────────────────────────────────────────────────                                                                                                     100%
 Progress: ───────────────────────────────────────────────────                                                                                                   100%
 Progress: ────────────────────────────────────────────────────                                                                                                  100%
 Progress: ──────────────────────────────────────────────────────                                                                                                100%
 Progress: ───────────────────────────────────────────────────────                                                                                               100%
 Progress: ────────────────────────────────────────────────────────                                                                                              100%
 Progress: ─────────────────────────────────────────────────────────                                                                                             100%
 Progress: ──────────────────────────────────────────────────────────                                                                                            100%
 Progress: ───────────────────────────────────────────────────────────                                                                                           100%
 Progress: ─────────────────────────────────────────────────────────────                                                                                         100%
 Progress: ────────────────────────────────────────────────────────────────                                                                                      100%
 Progress: ────────────────────────────────────────────────────────────────                                                                                      100%
 Progress: ─────────────────────────────────────────────────────────────────                                                                                     100%
 Progress: ───────────────────────────────────────────────────────────────────                                                                                   100%
 Progress: ────────────────────────────────────────────────────────────────────                                                                                  100%
 Progress: ─────────────────────────────────────────────────────────────────────                                                                                 100%
 Progress: ──────────────────────────────────────────────────────────────────────                                                                                100%
 Progress: ─────────────────────────────────────────────────────────────────────────                                                                             100%
 Progress: ─────────────────────────────────────────────────────────────────────────                                                                             100%
 Progress: ───────────────────────────────────────────────────────────────────────────                                                                           100%
 Progress: ─────────────────────────────────────────────────────────────────────────────                                                                         100%
 Progress: ──────────────────────────────────────────────────────────────────────────────                                                                        100%
 Progress: ──────────────────────────────────────────────────────────────────────────────                                                                        100%
 Progress: ────────────────────────────────────────────────────────────────────────────────                                                                      100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────                                                                     100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────                                                                  100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────                                                                  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────                                                                 100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────                                                              100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────                                                             100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────                                                             100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────                                                            100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────                                                          100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────                                                        100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────                                                        100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────                                                       100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────                                                    100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────                                                    100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────                                                  100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────                                                 100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────                                               100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────                                               100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────                                             100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────                                           100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────                                          100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────                                        100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────                                       100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────                                      100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────                                     100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────                                     100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────                                   100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                                  100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                               100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                               100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                               100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                             100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                            100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                         100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                         100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                        100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                       100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                     100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                    100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                   100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                  100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────                100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────               100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────             100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────            100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────            100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────          100%
 Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────         100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────        100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────       100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────       100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────      100%
 Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────    100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────   100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────   100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────   100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────   100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────  100%
 Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
delta_results$log1_delta <- log(delta_results$delta + 1)
delta_results <- delta_results %>% arrange(delta)

pos_counts <- run_consensus_df %>% group_by(POS) %>% summarise(count = n())

delta_results$count <- pos_counts$count[match(delta_results$POS, pos_counts$POS)]

ggplot(delta_results, aes(x = delta)) + geom_histogram() + theme_bw(base_size = 14)

ggsave(file = "./Figures/delta_histogram.png", width = 9, height = 7)
ggsave(file = "./Figures/delta_histogram.pdf", width = 9, height = 7)

write.csv(delta_results, file = "./Processed_data/delta_results.csv", quote = FALSE, 
    row.names = FALSE)

D statistic

bindata <- map_dfc(names(tb)[tb > 2], ~{
    ml_tree$tip.label %in% run_consensus_df$sample[run_consensus_df$POS == .x]
})
colnames(bindata) <- names(tb)[tb > 2]
bindata <- bindata * 1
bindata <- bindata %>% add_column(sample = ml_tree$tip.label, .before = 1)


bindata <- caper::comparative.data(ml_tree, bindata, sample)

saveRDS(bindata, "./Processed_data/bindata.RDS")
bindata <- readRDS("./Processed_data/bindata.RDS")

plan(multiprocess)

phylo.d <- function(data, phy, names.col, binvar, permut = 1000, rnd.bias = NULL) {
    if (!missing(data)) {
        if (!inherits(data, "comparative.data")) {
            if (missing(names.col)) 
                stop("names column is missing")
            names.col <- deparse(substitute(names.col))
            data <- caicStyleArgs(data = data, phy = phy, names.col = names.col)
        }
    }
    # binvar <- deparse(substitute(binvar))
    bininds <- match(binvar, names(data$data))
    if (is.na(bininds)) 
        (stop("'", binvar, "' is not a variable in data."))
    ds <- data$data[, bininds]
    if (any(is.na(ds))) 
        stop("'", binvar, "' contains missing values.")
    if (is.character(ds)) 
        ds <- as.factor(ds)
    if (length(unique(ds)) > 2) 
        stop("'", binvar, "' contains more than two states.")
    if (length(unique(ds)) < 2) 
        stop("'", binvar, "' only contains a single state.")
    propStates <- unclass(table(ds))
    propState1 <- propStates[1]/sum(propStates)
    names(dimnames(propStates)) <- binvar
    if (is.factor(ds)) 
        ds <- as.numeric(ds)
    if (!is.numeric(permut)) 
        (stop("'", permut, "' is not numeric."))
    if (!is.null(rnd.bias)) {
        rnd.bias <- deparse(substitute(rnd.bias))
        rnd.ind <- match(rnd.bias, names(data$data))
        if (is.na(rnd.ind)) 
            (stop("'", rnd.bias, "' is not a variable in data."))
        rnd.bias <- data$data[, rnd.bias]
    }
    el <- data$phy$edge.length
    elTip <- data$phy$edge[, 2] <= length(data$phy$tip.label)
    if (any(el[elTip] == 0)) 
        stop("Phylogeny contains pairs of tips on zero branch lengths, cannot currently simulate")
    if (any(el[!elTip] == 0)) 
        stop("Phylogeny contains zero length internal branches. Use di2multi.")
    ds.ran <- replicate(permut, sample(ds, prob = rnd.bias))
    if (is.null(data$vcv)) {
        vcv <- VCV.array(data$phy)
    } else {
        vcv <- data$vcv
    }
    ds.phy <- rmvnorm(permut, sigma = unclass(vcv))
    ds.phy <- as.data.frame(t(ds.phy))
    ds.phy.thresh <- apply(ds.phy, 2, quantile, propState1)
    ds.phy <- sweep(ds.phy, 2, ds.phy.thresh, "<")
    ds.phy <- as.numeric(ds.phy)
    dim(ds.phy) <- dim(ds.ran)
    ds.ran <- cbind(Obs = ds, ds.ran)
    ds.phy <- cbind(Obs = ds, ds.phy)
    dimnames(ds.ran) <- dimnames(ds.phy) <- list(data$phy$tip.label, c("Obs", paste("V", 
        1:permut, sep = "")))
    phy <- reorder(data$phy, "pruningwise")
    ds.ran.cc <- contrCalc(vals = ds.ran, phy = phy, ref.var = "V1", picMethod = "phylo.d", 
        crunch.brlen = 0)
    ds.phy.cc <- contrCalc(vals = ds.phy, phy = phy, ref.var = "V1", picMethod = "phylo.d", 
        crunch.brlen = 0)
    ransocc <- colSums(ds.ran.cc$contrMat)
    physocc <- colSums(ds.phy.cc$contrMat)
    if (round(ransocc[1], digits = 6) != round(physocc[1], digits = 6)) 
        stop("Problem with character change calculation in phylo.d")
    obssocc <- ransocc[1]
    ransocc <- ransocc[-1]
    physocc <- physocc[-1]
    soccratio <- (obssocc - mean(physocc))/(mean(ransocc) - mean(physocc))
    soccpval1 <- sum(ransocc < obssocc)/permut
    soccpval0 <- sum(physocc > obssocc)/permut
    dvals <- list(DEstimate = soccratio, Pval1 = soccpval1, Pval0 = soccpval0, Parameters = list(Observed = obssocc, 
        MeanRandom = mean(ransocc), MeanBrownian = mean(physocc)), StatesTable = propStates, 
        Permutations = list(random = ransocc, brownian = physocc), NodalVals = list(observed = ds.ran.cc$nodVal[, 
            1, drop = FALSE], random = ds.ran.cc$nodVal[, -1, drop = FALSE], brownian = ds.phy.cc$nodVal[, 
            -1, drop = FALSE]), binvar = binvar, data = data, nPermut = permut, rnd.bias = rnd.bias)
    class(dvals) <- "phylo.d"
    return(dvals)
}

Dstat_results <- furrr::future_map_dfr(colnames(bindata$data), ~{
    ds <- phylo.d(bindata, ml_tree, binvar = .x)
    tibble(pos = .x, D_stat = ds$DEstimate, prob_rand = ds$Pval1, prob_brow = ds$Pval0)
})
write.csv(Dstat_results, "./Processed_data/Dstat_results.csv", row.names = FALSE, 
    quote = FALSE)

combine into table

Dstat_results <- fread("./Processed_data/Dstat_results.csv", data.table = FALSE) %>% 
    as_tibble()
concordance_stats <- merge(delta_results, Dstat_results, by.x = "POS", by.y = "pos", 
    all = TRUE) %>% as_tibble()
concordance_stats$log1_delta <- NULL
colnames(concordance_stats) <- c("position", "delta", "MAF count", "D statistic", 
    "D probability random", "D probability brownian")
concordance_stats <- concordance_stats[, c("position", "delta", "D statistic", "D probability random", 
    "D probability brownian", "MAF count")]
write.csv(concordance_stats, file = "Processed_data/concordance_statistics.csv", 
    quote = FALSE, row.names = FALSE)
knitr::kable(concordance_stats)
position delta D statistic D probability random D probability brownian MAF count
10029 922.169005 0.2889918 0.034 0.160 5
10076 906.579608 0.7788818 0.435 0.019 3
10252 34.238295 2.0590863 0.934 0.000 4
10323 20.060643 0.6586141 0.182 0.006 7
10369 33.919858 0.7769929 0.333 0.005 6
10507 895.803689 2.0701430 0.961 0.000 4
106 47.056203 1.8846633 0.908 0.002 3
10626 31.250973 1.3354581 0.778 0.000 5
10702 16.346129 0.6890390 0.358 0.033 3
10755 904.974023 0.9468429 0.544 0.009 3
10986 41.850203 1.4081788 0.800 0.000 4
11071 460.150452 0.9670275 0.531 0.001 4
11074 5.806614 0.7854052 0.131 0.000 23
1108 916.869713 -0.9171246 0.000 0.999 5
11083 4.219420 0.8523321 0.112 0.000 54
11152 904.226865 1.6681246 0.845 0.001 3
11224 39.781636 1.1497168 0.668 0.002 4
11595 26.138538 1.4058260 0.778 0.001 3
12164 7.236215 1.1097504 0.653 0.000 7
12396 460.362242 1.5772495 0.833 0.002 3
12578 27.816617 1.0514094 0.599 0.000 8
12789 903.274064 1.0454830 0.606 0.004 3
1288 24.590686 0.5640706 0.272 0.064 3
13458 46.382286 1.7416906 0.875 0.002 4
13571 8.558075 0.9191850 0.415 0.000 13
13780 14.429531 0.6191912 0.025 0.000 18
14183 35.696348 0.5053818 0.221 0.094 3
14277 46.913249 0.7378212 0.394 0.029 3
14411 23.552494 3.3055198 0.999 0.000 4
14599 13.458579 1.7430929 0.885 0.001 3
14790 21.720046 1.3391999 0.806 0.001 5
14805 31.537316 1.3359925 0.760 0.000 15
14928 896.233160 -0.9962628 0.000 1.000 4
15004 47.252572 1.2024500 0.696 0.004 3
1547 7.509224 0.8817779 0.220 0.000 35
15720 24.698824 0.6660716 0.287 0.029 4
15738 15.502143 0.4457783 0.196 0.101 3
15842 897.731312 0.4683315 0.161 0.080 4
16111 38.020458 0.1378600 0.055 0.352 3
16332 894.854330 1.9797325 0.919 0.000 3
16375 17.062077 1.2148665 0.823 0.000 13
16393 713.642300 0.9375319 0.522 0.003 4
16438 918.727623 0.8064141 0.375 0.003 5
16466 17.949630 1.2311658 0.815 0.000 14
16474 29.185294 1.0100562 0.567 0.002 5
16887 21.766963 0.4905783 0.008 0.012 13
16949 31.978245 0.8331507 0.388 0.005 6
17012 31.089600 0.8861878 0.500 0.009 3
17013 338.063868 0.7563616 0.330 0.010 6
17074 18.515522 0.9220798 0.476 0.002 5
17167 22.446415 0.8193273 0.282 0.001 13
17252 927.354404 0.9768491 0.555 0.008 3
17321 910.678323 0.9760400 0.556 0.012 3
17410 921.098672 0.5210627 0.163 0.058 4
17440 46.965205 1.4276433 0.778 0.003 3
17456 23.668454 2.0450311 0.922 0.000 4
17550 34.566605 0.8324486 0.414 0.008 5
17676 40.726470 1.2842821 0.749 0.000 5
17678 908.614841 0.2315699 0.071 0.266 3
17731 610.022560 -0.0562745 0.015 0.584 3
17795 19.524504 1.1932256 0.686 0.001 4
18028 913.610410 1.0682968 0.616 0.006 3
18555 899.060274 1.5076961 0.809 0.000 4
186 37.215302 0.4626009 0.123 0.065 5
18744 912.800721 0.9844032 0.571 0.007 3
18877 35.577666 0.8819918 0.473 0.007 4
1912 904.698722 0.9902185 0.537 0.000 6
19269 916.020672 1.6826850 0.890 0.000 4
19679 39.523938 0.8464676 0.473 0.022 3
203 8.272362 1.0469806 0.606 0.000 15
20578 47.086819 0.7468074 0.391 0.030 4
20627 28.441100 1.1606003 0.676 0.002 4
20759 18.007787 1.0343056 0.579 0.004 4
20762 903.303327 0.8766545 0.487 0.014 3
20922 16.868402 1.0618966 0.616 0.004 4
20925 52.493699 1.1190603 0.637 0.006 3
21137 21.098065 1.1067612 0.691 0.000 12
2144 55.981400 1.5891500 0.828 0.001 3
21575 21.401315 1.6091362 0.979 0.000 15
21622 34.546693 0.6739179 0.355 0.047 3
21707 912.832652 -0.1996671 0.007 0.708 3
21752 45.751451 0.7586685 0.413 0.034 3
2189 43.294889 1.8617404 0.904 0.000 3
22104 39.590112 0.8421468 0.475 0.015 3
222 904.011964 0.7765236 0.441 0.024 3
22899 23.526642 1.0334498 0.574 0.000 9
23029 25.322181 1.3845725 0.761 0.001 3
23161 48.367080 0.5947332 0.289 0.043 3
23189 42.483799 0.6016069 0.308 0.054 3
23191 643.528912 -0.0226213 0.018 0.561 3
23202 31.727734 0.7876710 0.390 0.015 4
23242 47.130087 1.6626637 0.863 0.001 3
23243 46.565709 0.9617360 0.557 0.011 3
23248 23.210448 0.9885911 0.550 0.004 4
23271 46.933662 0.8391196 0.478 0.027 3
23272 47.137930 0.4058863 0.177 0.144 3
23277 26.954562 0.3524829 0.096 0.122 4
23278 10.824628 0.8523076 0.453 0.003 4
23280 38.010677 0.6663094 0.267 0.011 5
23282 46.276262 0.9441398 0.524 0.011 4
23285 46.655767 0.9228932 0.516 0.011 3
23310 47.066340 0.7207019 0.377 0.032 3
23311 907.275661 0.5264970 0.197 0.061 4
23435 18.031532 1.0004395 0.555 0.000 6
23525 28.639411 0.4442847 0.100 0.063 6
23868 38.216322 1.5100397 0.807 0.001 3
23987 15.171188 0.2667905 0.050 0.213 4
241 47.202898 2.9034475 0.981 0.000 16
24138 611.683294 0.6066293 0.312 0.039 3
24213 10.977988 1.2727473 0.830 0.000 11
24381 900.006527 0.6823688 0.348 0.032 3
2453 903.310090 -0.9881991 0.000 1.000 4
24781 909.726101 -0.8832196 0.000 0.969 3
24899 14.111114 0.6006265 0.287 0.068 3
24933 908.956592 0.8644515 0.366 0.000 9
25000 35.308871 1.1283243 0.659 0.000 5
25036 918.931021 0.6163239 0.316 0.052 3
25096 904.548738 0.2635194 0.091 0.221 3
25169 916.284864 2.2396352 0.951 0.000 5
25521 8.893667 1.2062823 0.830 0.000 20
25572 34.810932 1.3631463 0.776 0.003 3
25613 917.232161 1.3739975 0.756 0.001 3
25728 47.665211 1.8599714 0.893 0.000 3
25889 19.500558 1.3352943 0.795 0.001 4
25996 919.345814 1.7056470 0.860 0.003 3
26137 6.748433 0.8627953 0.113 0.000 50
26270 22.773422 1.8003898 0.916 0.000 4
26333 8.554091 0.9343625 0.485 0.000 7
26408 47.217866 1.5946631 0.843 0.000 3
26526 903.180683 0.2312238 0.092 0.266 3
26533 913.523868 1.4092581 0.769 0.001 4
26681 38.378944 1.0019230 0.541 0.000 7
26735 904.511829 1.7548477 0.867 0.000 3
26768 36.819742 2.2141653 0.976 0.000 6
26801 20.945421 0.7359306 0.309 0.006 5
26858 46.569481 0.4650056 0.198 0.110 4
26895 536.050587 0.6203520 0.266 0.033 5
26927 21.217970 0.7373354 0.397 0.030 3
27213 42.044558 0.7905364 0.383 0.012 4
27297 46.620802 0.8961628 0.497 0.005 4
27384 20.143295 1.3463073 0.810 0.000 7
27390 52.982767 0.2729931 0.107 0.236 3
27434 12.462871 0.7759181 0.434 0.025 3
27476 16.767318 0.7926654 0.436 0.025 3
27493 890.270021 1.2159817 0.706 0.003 3
27876 917.066349 -0.2164479 0.008 0.739 3
27877 916.835971 -0.2378501 0.009 0.760 3
27920 40.861877 0.6959937 0.319 0.024 5
27925 20.992935 1.0727282 0.620 0.000 6
28001 32.363236 0.2000732 0.084 0.322 3
28077 913.168074 2.1438350 0.937 0.000 3
28079 16.366369 0.6299436 0.150 0.010 7
28086 46.759769 1.2421113 0.719 0.000 3
28087 24.592351 0.8613318 0.443 0.006 4
28093 901.509063 2.3999901 0.960 0.000 3
28253 7.986339 1.1458794 0.885 0.000 52
28310 915.710318 2.1325025 0.944 0.000 3
28377 906.871995 1.0835206 0.619 0.006 4
28603 26.379759 1.0751980 0.632 0.000 10
28744 589.794269 0.0391932 0.011 0.482 4
28770 59.025294 0.0593732 0.034 0.450 3
28826 10.871870 0.1225743 0.000 0.362 8
28854 33.657232 1.1900244 0.693 0.001 5
28881 17.365026 1.2870147 0.775 0.000 16
28882 17.087211 1.3175207 0.771 0.000 15
28883 17.080275 1.3356146 0.785 0.000 17
28887 9.251802 0.9709223 0.517 0.000 8
29041 908.196895 0.8351584 0.450 0.021 3
29095 895.684147 1.2506875 0.693 0.005 3
29160 908.532255 0.6119807 0.315 0.047 3
29200 46.974102 2.1946723 0.934 0.000 3
29242 911.164435 0.9815830 0.496 0.000 9
29247 17.546656 2.6387378 0.969 0.000 3
29253 9.160738 0.9487779 0.511 0.006 4
29272 511.342743 0.3832750 0.168 0.152 3
29274 46.937769 0.9036224 0.506 0.007 3
29311 41.476842 0.4627240 0.134 0.065 5
29421 521.195248 0.3205215 0.125 0.186 3
29543 47.122681 0.7796417 0.422 0.034 3
29555 24.481053 1.4674829 0.799 0.002 3
29635 907.365292 1.4327927 0.818 0.000 4
29640 915.174925 0.3231262 0.124 0.190 3
29642 905.396492 2.2900685 0.942 0.000 3
29686 909.991881 2.0812054 0.930 0.000 3
29700 46.544495 0.8482502 0.468 0.017 3
29730 28.443528 0.2773870 0.045 0.181 4
29742 914.323475 1.0001127 0.585 0.010 3
29743 440.737131 0.5434769 0.209 0.050 5
29750 897.004770 1.6027514 0.851 0.000 4
29754 10.051708 0.9748936 0.534 0.002 5
29774 31.463890 0.7765144 0.379 0.015 4
29779 13.724440 1.0916799 0.642 0.002 3
29781 904.168434 1.0112811 0.585 0.006 4
3096 7.861504 1.1572549 0.766 0.000 15
313 904.352748 0.2976671 0.125 0.210 3
3176 914.390074 1.7797672 0.881 0.000 3
3177 378.277723 0.5786564 0.241 0.041 4
3259 924.715684 0.8467935 0.443 0.010 5
337 18.734333 0.4466009 0.199 0.118 3
3425 576.759090 -0.2167123 0.000 0.761 3
346 33.331726 1.8512194 0.922 0.000 5
3564 45.186188 0.9911267 0.559 0.015 4
3877 14.804358 0.4382944 0.093 0.059 5
4399 899.162216 0.7977106 0.446 0.021 3
4505 46.974473 0.7738092 0.429 0.026 3
462 907.884544 0.2866699 0.051 0.196 4
4815 47.145122 0.5622523 0.271 0.060 3
558 25.641086 0.8661731 0.366 0.000 9
5622 919.703200 -0.1710916 0.011 0.692 3
5812 26.006750 0.6199765 0.318 0.055 3
5826 917.320630 1.2414954 0.712 0.003 3
5869 903.658552 1.0885543 0.648 0.004 3
6121 908.128294 2.0983190 0.935 0.000 3
6286 47.381993 0.5391345 0.253 0.063 3
6317 911.779172 0.6748436 0.315 0.015 4
635 4.442886 0.9107739 0.259 0.000 42
643 20.875013 1.8036783 0.887 0.001 3
6513 46.928088 0.7790056 0.439 0.023 3
6539 42.690776 1.8057689 0.911 0.000 4
683 8.060202 1.0364621 0.610 0.000 28
7420 896.710792 1.5965251 0.829 0.001 3
745 902.287980 0.5546772 0.271 0.082 3
7504 890.069769 -0.0471217 0.015 0.577 3
7528 535.641122 0.1986550 0.031 0.304 5
7735 11.247375 1.3358091 0.794 0.001 6
7765 16.723721 0.8295135 0.479 0.014 3
7768 928.637960 0.7158644 0.378 0.036 3
7851 905.646368 0.5982510 0.305 0.054 3
8318 42.816698 0.6469713 0.282 0.028 4
835 902.733849 0.7618534 0.407 0.026 3
851 900.828356 1.0909988 0.618 0.002 3
9052 53.114848 0.3836342 0.158 0.159 3
9165 30.831698 1.1215850 0.659 0.000 7
9180 37.422018 0.7114883 0.293 0.008 5
9286 46.888588 0.5611465 0.261 0.057 3
929 911.194296 0.8256940 0.461 0.021 3
9430 24.532582 0.9120408 0.395 0.000 13
9438 16.785460 1.4306946 0.949 0.000 15
9474 14.120403 0.7631943 0.257 0.001 8
9479 16.331890 0.5122737 0.077 0.018 7
9491 898.542753 0.8109986 0.198 0.000 18
9519 924.559804 1.6478497 0.847 0.000 3
9561 44.939676 2.1842099 0.953 0.000 3
9603 918.395384 1.1234314 0.634 0.006 3
9629 47.469329 1.9568510 0.906 0.000 3
9634 912.989998 1.1172785 0.640 0.005 3
9737 912.922522 0.7948590 0.358 0.006 5
9741 905.073732 1.5025152 0.806 0.002 3
9848 25.774409 0.0509467 0.001 0.471 5
9866 50.138510 0.3719184 0.099 0.130 4
9924 919.539635 0.9524293 0.543 0.009 3

Within host variation increases over time

To investigate the possible accumulation of de novo mutations during the course of an infection, we studied 43 individuals for whom we had multiple samples collected at different timepoints. Initially, we collect the VAFs inferred using the shearwater algorithm. Unlike in much of the analysis above, we allow for more complex variants rather than just single nucleotude polymorphisms.

MINMAF <- 0.01
MINDEPTH <- 1000

load("./data/allele_counts_allsites_allsamples.rda")

rownames(countsall) <- fread("./data/sample_list_1181.tsv", header = FALSE, data.table = FALSE)$V1
colnames(countsall) <- 1:ncol(countsall)
dimnames(countsall)[[3]] <- c("A", "T", "C", "G", "-", "INS")

multi_run_calls <- shearwater_complex_calls %>% filter(r1_vaf < 1 & r2_vaf < 1) %>% 
    filter(sampleID %in% unlist(meta_multi$samples))

multi_run_calls$host <- replicate_meta$biosample_source_id[match(multi_run_calls$sampleID, 
    replicate_meta$central_sample_id)]
multi_run_calls$date <- replicate_meta$collection_date[match(multi_run_calls$sampleID, 
    replicate_meta$central_sample_id)]
multi_run_calls <- multi_run_calls %>% filter(date != "None")
temp <- table(multi_run_calls$host[!duplicated(multi_run_calls$sampleID)])
multi_run_calls <- multi_run_calls %>% filter(!multi_run_calls$host %in% names(temp)[temp < 
    2])
multi_run_calls$date <- as_date(multi_run_calls$date)
multi_run_calls$mut_simple <- multi_run_calls$mut
multi_run_calls$mut_simple[!multi_run_calls$mut_simple %in% c("-", "T", "G", "C", 
    "A", "INS")] <- map_chr(multi_run_calls$mut_simple[!multi_run_calls$mut_simple %in% 
    c("-", "T", "G", "C", "A", "INS")], ~{
    substr(.x, 1, 1)
})

multi_run_calls_recovered <- multi_run_calls %>% group_by(host) %>% group_map(~{
    samples <- unique(.x$sampleID)
    nsamples <- length(samples)
    
    mis <- table(paste(.x$mut_simple, .x$pos))
    mis <- .x[paste(.x$mut_simple, .x$pos) %in% names(mis)[mis != nsamples], ]
    
    missing <- map_dfr(1:nrow(mis), function(i) {
        missing_samples <- unique(samples[!samples %in% mis$sampleID[(mis$pos == 
            mis$pos[[i]]) & (mis$mut_simple[[i]] == mis$mut_simple)]])
        vafs <- map_dfr(missing_samples, function(s) {
            vaf <- countsall[s, mis$pos[[i]], mis$mut_simple[[i]]]/sum(countsall[s, 
                mis$pos[[i]], ])
            countsall["CAMB-76937", 6855, "T"]
            sum(countsall["CAMB-76937", 6855, ])
            if (vaf > 0.001) {
                temp <- mis[i, ]
                temp$sampleID <- s
                temp$vaf <- vaf
                temp$date <- unique(replicate_meta$collection_date[replicate_meta$central_sample_id == 
                  s])
                return(temp)
            } else {
                return(tibble())
            }
        })
        return(vafs)
    })
    
    return(missing)
    
}, keep = TRUE)

multi_run_calls_recovered <- do.call(rbind, multi_run_calls_recovered)
multi_run_calls_recovered <- multi_run_calls_recovered[!duplicated(multi_run_calls_recovered[, 
    c("sampleID", "pos", "mut")]), ]
multi_run_calls_recovered <- rbind(multi_run_calls, multi_run_calls_recovered)

all_consensus <- multi_run_calls_recovered %>% group_by(pos, mut, host) %>% summarise(allcons = all(vaf > 
    0.75)) %>% filter(allcons)
all_consensus$combined <- sprintf("%s %s %s", all_consensus$pos, all_consensus$mut, 
    all_consensus$host)

multi_run_calls_recovered <- multi_run_calls_recovered %>% filter(!paste(pos, mut, 
    host) %in% all_consensus$combined)

Generate time series plots of the inferred VAFs within each patient. Multiple samples on the same day are ‘jittered’ to enable the variation within a day to be observed.

all_meta <- fread("./data/majora.20200501.metadata.tsv") %>% as_tibble() %>% filter(central_sample_id %in% 
    multi_run_calls_recovered$sampleID)
multi_run_calls_recovered$swab_site <- all_meta$swab_site[match(multi_run_calls_recovered$sampleID, 
    all_meta$central_sample_id)]
multi_run_calls_recovered$sample_type_collected <- all_meta$sample_type_collected[match(multi_run_calls_recovered$sampleID, 
    all_meta$central_sample_id)]

plotdf <- multi_run_calls_recovered
plotdf$mut_pos <- paste(plotdf$mut_simple, plotdf$pos)

hostdaycount <- table(paste(plotdf$host, plotdf$date)[!duplicated(plotdf$sampleID)])
samedaycount <- unlist(map(paste(plotdf$host, plotdf$date), ~{
    ids <- unique(plotdf$sampleID[paste(plotdf$host, plotdf$date) == .x])
    ids <- setNames(1:length(ids), ids)
}))
min_date <- plotdf %>% group_by(host) %>% summarise(min_dat = min(date))
min_date <- setNames(min_date$min_dat, min_date$host)
plotdf$date_adj <- plotdf$date - min_date[plotdf$host]
plotdf$date_adj <- plotdf$date_adj + (samedaycount[plotdf$sampleID] - 1) * 0.1 - 
    (hostdaycount[paste(plotdf$host, plotdf$date)] - 1) * 0.05

ggplot(plotdf, aes(x = date_adj, y = vaf, col = mut_pos)) + geom_line(alpha = 0.5) + 
    geom_point(alpha = 0.5) + facet_wrap(~host, scales = "free_x") + scale_color_discrete(guide = FALSE) + 
    scale_x_continuous(breaks = 0:9) + scale_y_sqrt() + theme_bw(base_size = 14) + 
    xlab("days from first sample") + ylab("VAF")

ggsave(filename = "./Figures/vaf_by_repeated_sample.pdf", width = 15, height = 10)
ggsave(filename = "./Figures/vaf_by_repeated_sample.png", width = 15, height = 10)

To focus on the variance within samples we plot the number of shared variants versus the total number of minority variants for each pairwise combination of samples taken from the same patient on the same day. We also plot the proportion of shared variants between each pairwise combination, split by the different sampling techniques used.

same_days <- table(paste(multi_run_calls_recovered$host, multi_run_calls_recovered$date)[!duplicated(multi_run_calls_recovered$sampleID)])
same_days <- same_days[same_days > 1]

plotdf <- multi_run_calls_recovered[paste(multi_run_calls_recovered$host, multi_run_calls_recovered$date) %in% 
    names(same_days), ]
plotdf$mut_pos <- paste(plotdf$mut, plotdf$pos)

plotdf2 <- plotdf %>% group_by(host, date) %>% summarise(sample_type = paste(sort(unique(sample_type_collected)), 
    collapse = "_"), n_samples = length(unique(sampleID)), n_shared = sum(table(mut_pos) == 
    length(unique(sampleID))), mean_shared = mean(table(mut_pos)/length(unique(sampleID))), 
    ids = list(unique(sampleID)), tm = list(table(mut_pos)), n_muts = length(unique(mut_pos)), 
    prop_shared = sum(table(mut_pos) == length(unique(sampleID)))/length(unique(mut_pos)))

gg1 <- ggplot(plotdf2, aes(x = sample_type, y = prop_shared)) + geom_boxplot(outlier.colour = NA) + 
    geom_jitter(height = 0, width = 0.1, size = 2) + theme_bw(base_size = 14) + xlab("sample types") + 
    ylab("proportion of shared minority variants")
gg1

ggsave(filename = "./Figures/swab_type_vs_prop_shared.pdf", width = 9, height = 7)
ggsave(filename = "./Figures/swab_type_vs_prop_shared.png", width = 9, height = 7)

gg2 <- ggplot(plotdf2, aes(x = n_shared, y = n_muts)) + geom_jitter(height = 0.1, 
    width = 0.1, size = 2) + theme_bw(base_size = 14) + xlab("number of shared minority variant") + 
    ylab("total number of minority variants")
gg2

ggsave(filename = "./Figures/maf_total_vs_shared.pdf", width = 9, height = 7)
ggsave(filename = "./Figures/maf_total_vs_shared.png", width = 9, height = 7)

gg1 + gg2 + patchwork::plot_layout(nrow = 1)

# test excluding samples with more than two to avoid bias
t.test(plotdf2$prop_shared[plotdf2$n_samples == 2 & plotdf2$sample_type == "swab"], 
    plotdf2$prop_shared[plotdf2$n_samples == 2 & plotdf2$sample_type == "sputum_swab"], 
    alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  plotdf2$prop_shared[plotdf2$n_samples == 2 & plotdf2$sample_type ==  and plotdf2$prop_shared[plotdf2$n_samples == 2 & plotdf2$sample_type ==     "swab"] and     "sputum_swab"]
## t = 0.8484, df = 9.6287, p-value = 0.2084
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.1416669        Inf
## sample estimates:
## mean of x mean of y 
## 0.4226266 0.2988684
# alternative strategy of including the number of samples in the model leads to a
# similar result.
plotdf2$isswab <- plotdf2$sample_type == "swab"
summary(glm(n_shared ~ isswab + n_samples + n_muts, data = plotdf2, family = "poisson"))
## 
## Call:
## glm(formula = n_shared ~ isswab + n_samples + n_muts, family = "poisson", 
##     data = plotdf2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9422  -0.9009  -0.4691   0.5778   2.1638  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.312078   0.556106   4.158 3.22e-05 ***
## isswabTRUE   0.213813   0.207103   1.032    0.302    
## n_samples   -0.286469   0.260141  -1.101    0.271    
## n_muts       0.003482   0.011060   0.315    0.753    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 30.292  on 16  degrees of freedom
## Residual deviance: 26.173  on 13  degrees of freedom
## AIC: 95.326
## 
## Number of Fisher Scoring iterations: 5

Finally, we plot the difference in the number of withinhost minority variants between each pairwise combination of samples where the x-axis indicates the number of days seperating those samples.

pairwise_within_host_maf_diff <- map_dfr(unique(multi_run_calls_recovered$host), 
    ~{
        print(.x)
        temp <- multi_run_calls %>% filter(host == .x) %>% group_by(sampleID, date) %>% 
            summarise(maf_count = n()) %>% arrange(date)
        
        if (nrow(temp) < 2) 
            return(tibble())
        
        pw <- combinat::combn(nrow(temp), 2, simplify = FALSE)
        maf_diff <- map_dfr(pw, function(p) {
            tibble(date_diff = temp$date[[p[[2]]]] - temp$date[[p[[1]]]], maf_diff = temp$maf_count[[p[[2]]]] - 
                temp$maf_count[[p[[1]]]])
        }) %>% group_by(date_diff) %>% summarise(mean_maf_diff = mean(maf_diff))
        maf_diff$host <- .x
        return(maf_diff)
    })
## [1] "CAMS001914"
## [1] "CAMS001867"
## [1] "CAMS001503"
## [1] "CAMS001487"
## [1] "CAMS001467"
## [1] "CAMS001424"
## [1] "CAMS000787"
## [1] "CAMS000737"
## [1] "CAMS000672"
## [1] "CAMS000668"
## [1] "CAMS001910"
## [1] "CAMS002063"
## [1] "CAMS001538"
## [1] "CAMS001604"
## [1] "CAMS001586"
## [1] "CAMS001653"
## [1] "CAMS001666"
## [1] "CAMS001664"
## [1] "CAMS001670"
## [1] "CAMS001662"
## [1] "CAMS001741"
## [1] "CAMS001715"
## [1] "CAMS001803"
## [1] "CAMS001997"
## [1] "CAMS002135"
## [1] "CAMS002279"
## [1] "CAMS002128"
## [1] "CAMS002472"
## [1] "CAMS002162"
## [1] "CAMS002291"
## [1] "CAMS002186"
## [1] "CAMS002480"
## [1] "CAMS002667"
## [1] "CAMS002621"
## [1] "CAMS002328"
## [1] "CAMS002327"
## [1] "CAMS002325"
## [1] "CAMS002414"
## [1] "CAMS002412"
## [1] "CAMS002435"
## [1] "CAMS002532"
gg3 <- ggplot(pairwise_within_host_maf_diff, aes(x = date_diff, y = mean_maf_diff, 
    group = date_diff)) + geom_boxplot(outlier.colour = NA) + geom_jitter(height = 0, 
    width = 0.2) + theme_bw(base_size = 14) + scale_x_continuous(breaks = 0:9) + 
    xlab("time difference between samples (days)") + ylab("difference in number of minor alleles")
gg3

model_data <- multi_run_calls_recovered %>% group_by(host, date, sampleID, sample_type_collected) %>% 
    summarise(maf_count = n())
model_data$adj_date <- as.numeric(model_data$date - min_date[model_data$host])

summary(glmer(maf_count ~ adj_date + sample_type_collected + (1 | host), family = "poisson", 
    data = model_data))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: maf_count ~ adj_date + sample_type_collected + (1 | host)
##    Data: model_data
## 
##      AIC      BIC   logLik deviance df.resid 
##    653.5    666.0   -321.7    643.5       85 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.4443 -0.7832 -0.1684  0.7942  3.0867 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  host   (Intercept) 0.3254   0.5704  
## Number of obs: 90, groups:  host, 41
## 
## Fixed effects:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  2.71420    0.18374  14.772  < 2e-16 ***
## adj_date                     0.07559    0.02784   2.715  0.00663 ** 
## sample_type_collectedsputum  0.07890    0.17547   0.450  0.65296    
## sample_type_collectedswab   -0.26338    0.16662  -1.581  0.11394    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##                   (Intr) adj_dt smpl_typ_cllctdsp
## adj_date          -0.091                         
## smpl_typ_cllctdsp -0.717 -0.064                  
## smpl_typ_cllctdsw -0.845 -0.037  0.805
ggsave(filename = "./Figures/repeated_samples_maf_count_vs_date.pdf", width = 9, 
    height = 7)
ggsave(filename = "./Figures/repeated_samples_maf_count_vs_date.png", width = 9, 
    height = 7)